{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature: Out-Of-Fold Predictions and Feature Layer Activations from an LSTM"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In addition to the output of the final network layer, the model will also output the activations of the intermediate feature layer.\n",
    "\n",
    "To achieve this, we'll create a multi-output network (target output + activations output), and supply dummy ground truth and a dummy loss function to the second output."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pygoose import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from keras import backend as K\n",
    "from keras.models import Model, Sequential\n",
    "from keras.layers import *\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Automatically discover the paths to various data folders and compose the project structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "project = kg.Project.discover()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Identifier for storing these features on disk and referring to them later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "feature_list_id = 'oofp_nn_lstm_with_activations'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Make subsequent NN runs reproducible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "RANDOM_SEED = 42"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "np.random.seed(RANDOM_SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Word embedding lookup matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "embedding_matrix = kg.io.load(project.aux_dir + 'fasttext_vocab_embedding_matrix.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Padded sequences of word indices for every question."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_train.pickle')\n",
    "X_train_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_train.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_test_q1 = kg.io.load(project.preprocessed_data_dir + 'sequences_q1_fasttext_test.pickle')\n",
    "X_test_q2 = kg.io.load(project.preprocessed_data_dir + 'sequences_q2_fasttext_test.pickle')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_train = kg.io.load(project.features_dir + 'y_train.pickle')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Word embedding properties."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "EMBEDDING_DIM = embedding_matrix.shape[-1]\n",
    "VOCAB_LENGTH = embedding_matrix.shape[0]\n",
    "MAX_SEQUENCE_LENGTH = X_train_q1.shape[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(EMBEDDING_DIM, VOCAB_LENGTH, MAX_SEQUENCE_LENGTH)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def zero_loss(y_true, y_pred):\n",
    "    return K.zeros((1,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_model_question_branch():\n",
    "    input_q = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "    \n",
    "    embedding_q = Embedding(\n",
    "        VOCAB_LENGTH,\n",
    "        EMBEDDING_DIM,\n",
    "        weights=[embedding_matrix],\n",
    "        input_length=MAX_SEQUENCE_LENGTH,\n",
    "        trainable=False,\n",
    "    )(input_q)\n",
    "\n",
    "    timedist_q = TimeDistributed(Dense(\n",
    "        EMBEDDING_DIM,\n",
    "        activation='relu',\n",
    "    ))(embedding_q)\n",
    "\n",
    "    lambda_q = Lambda(\n",
    "        lambda x: K.max(x, axis=1),\n",
    "        output_shape=(EMBEDDING_DIM, )\n",
    "    )(timedist_q)\n",
    "    \n",
    "    output_q = lambda_q\n",
    "    return input_q, output_q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def create_model(params):    \n",
    "    embedding_layer = Embedding(\n",
    "        VOCAB_LENGTH,\n",
    "        EMBEDDING_DIM,\n",
    "        weights=[embedding_matrix],\n",
    "        input_length=MAX_SEQUENCE_LENGTH,\n",
    "        trainable=False,\n",
    "    )\n",
    "    lstm_layer = LSTM(\n",
    "        params['num_lstm'],\n",
    "        dropout=params['lstm_dropout_rate'],\n",
    "        recurrent_dropout=params['lstm_dropout_rate'],\n",
    "    )\n",
    "\n",
    "    input_q1 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "    embedded_sequences_1 = embedding_layer(input_q1)\n",
    "    x1 = lstm_layer(embedded_sequences_1)\n",
    "\n",
    "    input_q2 = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
    "    embedded_sequences_2 = embedding_layer(input_q2)\n",
    "    y1 = lstm_layer(embedded_sequences_2)\n",
    "\n",
    "    features = Concatenate(name='feature_output')([x1, y1])\n",
    "    dropout_feat = Dropout(params['dense_dropout_rate'])(features)\n",
    "    bn_feat = BatchNormalization()(dropout_feat)\n",
    "\n",
    "    dense_1 = Dense(params['num_dense'], activation='relu')(bn_feat)\n",
    "    dropout_1 = Dropout(params['dense_dropout_rate'])(dense_1)\n",
    "    bn_1 = BatchNormalization()(dropout_1)\n",
    "\n",
    "    output = Dense(1, activation='sigmoid', name='target_output')(bn_1)\n",
    "\n",
    "    model = Model(\n",
    "        inputs=[input_q1, input_q2],\n",
    "        outputs=[output, features],\n",
    "    )\n",
    "    \n",
    "    model.compile(\n",
    "        loss={'target_output': 'binary_crossentropy', 'feature_output': zero_loss},\n",
    "        loss_weights={'target_output': 1.0, 'feature_output': 0.0},\n",
    "        optimizer='nadam',\n",
    "        metrics=None,\n",
    "    )\n",
    "\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def predict(model, X_q1, X_q2):\n",
    "    \"\"\"\n",
    "    Mirror the pairs, compute two separate predictions, and average them.\n",
    "    \"\"\"\n",
    "    \n",
    "    y1 = model.predict([X_q1, X_q2], batch_size=1024, verbose=1).reshape(-1)   \n",
    "    y2 = model.predict([X_q2, X_q1], batch_size=1024, verbose=1).reshape(-1)    \n",
    "    return (y1 + y2) / 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Partition the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "NUM_FOLDS = 5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "kfold = StratifiedKFold(\n",
    "    n_splits=NUM_FOLDS,\n",
    "    shuffle=True,\n",
    "    random_state=RANDOM_SEED\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BATCH_SIZE = 2048"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "MAX_EPOCHS = 200"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Best values picked by Bayesian optimization."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model_params = {\n",
    "    'dense_dropout_rate': 0.075,\n",
    "    'lstm_dropout_rate': 0.332,\n",
    "    'num_dense': 130,\n",
    "    'num_lstm': 300,\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "feature_output_size = model_params['num_lstm'] * 2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create placeholders for out-of-fold predictions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train_oofp = np.zeros_like(y_train, dtype='float32')\n",
    "y_train_oofp_features = np.zeros((len(y_train), feature_output_size), dtype='float32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "y_test_oofp = np.zeros((len(X_test_q1), NUM_FOLDS), dtype='float32')\n",
    "y_test_oofp_features = np.zeros((len(X_test_q1), feature_output_size), dtype='float32')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The path where the best weights of the current model will be saved."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model_checkpoint_path = project.temp_dir + 'fold-checkpoint-' + feature_list_id + '.h5'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Fit the folds and compute out-of-fold predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "# Iterate through folds.\n",
    "for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train_q1, y_train)):\n",
    "    \n",
    "    # Augment the training set by mirroring the pairs.\n",
    "    X_fold_train_q1 = np.vstack([X_train_q1[ix_train], X_train_q2[ix_train]])\n",
    "    X_fold_train_q2 = np.vstack([X_train_q2[ix_train], X_train_q1[ix_train]])\n",
    "\n",
    "    X_fold_val_q1 = np.vstack([X_train_q1[ix_val], X_train_q2[ix_val]])\n",
    "    X_fold_val_q2 = np.vstack([X_train_q2[ix_val], X_train_q1[ix_val]])\n",
    "\n",
    "    # Ground truth should also be \"mirrored\".\n",
    "    y_fold_train = np.concatenate([y_train[ix_train], y_train[ix_train]])\n",
    "    y_fold_val = np.concatenate([y_train[ix_val], y_train[ix_val]])\n",
    "    \n",
    "    print()\n",
    "    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')\n",
    "    print()\n",
    "    \n",
    "    # Compile a new model.\n",
    "    model = create_model(model_params)\n",
    "\n",
    "    # Train.\n",
    "    model.fit(\n",
    "        # Create dummy ground truth values for the activation outputs.\n",
    "        [X_fold_train_q1, X_fold_train_q2],\n",
    "        [y_fold_train, np.zeros((len(y_fold_train), feature_output_size))],\n",
    "        \n",
    "        validation_data=(\n",
    "            [X_fold_val_q1, X_fold_val_q2],\n",
    "            [y_fold_val, np.zeros((len(y_fold_val), feature_output_size))],\n",
    "        ),\n",
    "\n",
    "        batch_size=BATCH_SIZE,\n",
    "        epochs=MAX_EPOCHS,\n",
    "        verbose=1,\n",
    "        \n",
    "        callbacks=[\n",
    "            # Stop training when the validation loss stops improving.\n",
    "            EarlyStopping(\n",
    "                monitor='val_loss',\n",
    "                min_delta=0.001,\n",
    "                patience=3,\n",
    "                verbose=1,\n",
    "                mode='auto',\n",
    "            ),\n",
    "            # Save the weights of the best epoch.\n",
    "            ModelCheckpoint(\n",
    "                model_checkpoint_path,\n",
    "                monitor='val_loss',\n",
    "                save_best_only=True,\n",
    "                verbose=2,\n",
    "            ),\n",
    "        ],\n",
    "    )\n",
    "        \n",
    "    # Restore the best epoch.\n",
    "    model.load_weights(model_checkpoint_path)\n",
    "    \n",
    "    # Compute out-of-fold predictions.\n",
    "    y_train_oofp[ix_val] = predict(model, X_train_q1[ix_val], X_train_q2[ix_val])\n",
    "    y_test_oofp[:, fold_num] = predict(model, X_test_q1, X_test_q2)\n",
    "    \n",
    "    # Clear GPU memory.\n",
    "    K.clear_session()\n",
    "    del X_fold_train_q1, X_fold_train_q2\n",
    "    del X_fold_val_q1, X_fold_val_q2\n",
    "    del model\n",
    "    gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_score = log_loss(y_train, y_train_oofp)\n",
    "print('CV score:', cv_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "feature_names = [feature_list_id]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features_train = y_train_oofp.reshape((-1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "features_test = np.mean(y_test_oofp, axis=1).reshape((-1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "project.save_features(features_train, features_test, feature_names, feature_list_id)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
